Description

Use historical Draw Results and the number of hunters to train a model we can use to predict the number of future hunters.

TODO - Include other potential inputs that could impact how many hunters get a license and show up. Those could include economic indicators, and costs associated with hunting (transportation, lodging).

NOTICE that I am only looking at the general rifle hunting seasons on public land. There are also hunters for Archery, Muzzleloader, Private Land, Ranching for Wildlife, etc.


Setup

Load required libraries for wrangling data, charting, and mapping

# Load required libraries for data wrangling, charting, and modeling.
# NOTE: plyr is deliberately loaded BEFORE dplyr -- loading it after dplyr
# would mask dplyr verbs (summarise, mutate, ...) and break the grouped
# aggregations below. rbind.fill() (used later) is the only plyr function needed.
library(plyr, quietly = TRUE)     # data wrangling (rbind.fill)
library(dplyr, quietly = TRUE)    # data wrangling
library(ggplot2, quietly = TRUE)  # charting
library(ggthemes, quietly = TRUE) # highcharts theme and palette
library(scales, quietly = TRUE)   # percent() helper for labeling plots
library(caret, quietly = TRUE)    # classification and regression training

Set our preferred charting theme

# Set the default ggplot theme for every chart in this document:
# minimal base + Highcharts look, with wider legend keys for readability.
theme_set(theme_minimal()+theme_hc()+theme(legend.key.width = unit(1.5, "cm")))

Run script to get hunter data

# Run the harvest-data script; expected to define COElkRifleAll (used below).
source('~/_code/colorado-dow/datasets/Colorado Elk Harvest Data.R', echo=F)

Table of the harvest data

# Print the harvest data table for inspection.
COElkRifleAll

Run script to get draw data

# Run the draw-summary script; expected to define COElkDrawAll (used below).
source('~/_code/colorado-dow/datasets/Elk Drawing Summaries.R', echo=F)

Table of the data

# Print the draw data, then load GMU boundary and road shapefiles.
# The sourced script is expected to define Unitboundaries2, COroads, and
# data_centroids (used in the map plots below).
COElkDrawAll
source('~/_code/colorado-dow/datasets/Colorado GMUnit and Road data.R', echo=F)
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/psarnow/_code/colorado-dow/datasets/CPW_GMUBoundaries/BigGameGMUBoundaries03172015.shp", layer: "BigGameGMUBoundaries03172015"
## with 185 features
## It has 12 fields
## Integer64 fields read as strings:  GMUID 
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/psarnow/_code/colorado-dow/datasets/ne_10m_roads/ne_10m_roads.shp", layer: "ne_10m_roads"
## with 56601 features
## It has 29 fields
## Integer64 fields read as strings:  scalerank question

Take a peek at the boundary data

# Quick look at the GMU boundary data loaded above.
head(Unitboundaries2)

Move back to predictive analytics directory

# Return to the analysis directory (the sourced scripts may have changed it).
# NOTE(review): setwd() in scripts is fragile -- consider project-relative
# paths (e.g. the here package) so this document is portable.
setwd("~/_code/colorado-dow/phase III - predictive analytics")

Organize data

We will start by grouping all of the seasons together and modeling the number of hunters per Year and Unit. First, group the draw results data by Year and Unit.

# Aggregate draw results to one row per Year x Unit, combining all seasons.
# na.rm = TRUE so units with partial records still aggregate cleanly
# (was na.rm = T; T is reassignable and should not be used for TRUE).
COElkDraw <- summarise(group_by(COElkDrawAll, Year, Unit),
                       Quota = sum(Orig_Quota, na.rm = TRUE),
                       Drawn = sum(Chcs_Drawn, na.rm = TRUE))

Appropriate field classes for model training

# Ensure Year is numeric so it can serve as a continuous model predictor
# and so the join keys below match COElkHunters$Year.
COElkDraw$Year <- as.numeric(COElkDraw$Year)

Group Hunter data by Year and Unit

# Aggregate hunter counts to one row per Year x Unit, summing all three
# license categories (antlered, antlerless, either-sex) across seasons.
COElkHunters <- summarise(group_by(COElkRifleAll, Year, Unit),
                          Hunters = sum(c(Hunters.Antlered, Hunters.Antlerless, Hunters.Either),
                                        na.rm = TRUE))

# Ensure Year is numeric to match COElkDraw$Year for the join below.
COElkHunters$Year <- as.numeric(COElkHunters$Year)

Join in Hunter and Draw data together

# Attach draw results (Quota, Drawn) to each Year x Unit hunter record.
# Left join keeps every hunter record even where no draw data exists.
COElkHunters <- left_join(COElkHunters, COElkDraw, by = c("Year","Unit"))

Replace the draw data that don’t have entries with 0

# Units with no draw records get zero rather than NA so they can still be
# used as model inputs.
for (draw_col in c("Drawn", "Quota")) {
  COElkHunters[[draw_col]][is.na(COElkHunters[[draw_col]])] <- 0
}

Split into train and test sets. We will use 75% of the data for training.

# Hold out 25% of the Year x Unit rows for testing.
# set.seed makes the split -- and therefore the reported holdout metrics --
# reproducible across runs (the original split was unseeded).
set.seed(42)
data_index <- sample(seq_len(nrow(COElkHunters)),
                     size = floor(0.75 * nrow(COElkHunters)),
                     replace = FALSE)
traindata <- COElkHunters[ data_index, ]
testdata  <- COElkHunters[-data_index, ]

Save off for importing into AzureML

# Export the modeling dataset for import into AzureML.
write.csv(COElkHunters, file = "~/_code/colorado-dow/datasets/COElkHunters.csv",
          row.names = FALSE)

# Resampling scheme shared by both caret::train calls below:
# 4-fold cross-validation, repeated 10 times (RMSE/Rsquared/MAE summaries).
fitControl <- trainControl(
  method = "repeatedcv",
  number = 4, # folds
  repeats = 10, # repeats (was experimented with at 20)
  # classProbs = TRUE,
  # savePred = TRUE,
  allowParallel = TRUE,
  summaryFunction = defaultSummary)

# Train a Cubist regression of Hunters on all other columns.
# FIX(review): the original trained on the FULL COElkHunters data and then
# "evaluated" on testdata -- a subset of that same data -- which leaks test
# rows into training and inflates the holdout metrics. Train on traindata.
HuntersModel <- train(Hunters ~ ., data = traindata,
                      method = "cubist",
                      # preProc = c("center", "scale"),
                      tuneLength = 10,
                      trControl = fitControl)

# Print the resampling results across the tuning grid.
HuntersModel
## Cubist 
## 
## 1545 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 10 times) 
## Summary of sample sizes: 1159, 1157, 1160, 1159, 1157, 1159, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    1          0          580.9252  0.6074965  396.1586
##    1          5          228.5752  0.9379032  129.1305
##    1          9          249.5225  0.9261879  150.8964
##   10          0          504.8875  0.7054539  340.2092
##   10          5          170.6116  0.9660306  107.3733
##   10          9          191.8211  0.9574095  124.7386
##   20          0          503.9181  0.7084681  340.1525
##   20          5          166.5379  0.9676920  105.5659
##   20          9          188.1712  0.9591974  123.0064
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.

Important predictors

# Variable importance for the tuned Cubist model, scaled to 0-100.
ImpPred <- varImp(HuntersModel, scale = TRUE)
ImpPred
## cubist variable importance
## 
##   only 20 most important variables shown (out of 137)
## 
##         Overall
## Quota    100.00
## Drawn     80.35
## Unit81    42.77
## Unit64    38.73
## Unit29    38.15
## Unit201   35.26
## Unit181   34.10
## Year      33.53
## Unit20    32.37
## Unit191   32.37
## Unit57    32.37
## Unit69    31.79
## Unit46    31.79
## Unit9     31.79
## Unit50    30.64
## Unit391   30.06
## Unit42    29.48
## Unit10    29.48
## Unit62    28.32
## Unit61    28.32
# Check performance on the held-out test set: predict, then compute
# RMSE / Rsquared / MAE against the observed hunter counts.
predictdata <- predict(HuntersModel, testdata)

postResample(pred = predictdata, obs = testdata$Hunters)
##       RMSE   Rsquared        MAE 
## 95.6355374  0.9881788 62.6447113
# svmRadial RMSE=427
# svmLinear RMSE=363
# cubist RMSE=104

We can iterate on the above model by tweaking preprocessing parameters and trying other model algorithms. Chart the performance of the predicted values against the observed values.

# Pair each holdout prediction with its observed value for plotting.
chartperformance <- data.frame(
  predicted = predictdata,
  observed  = testdata$Hunters
)

# Predicted-vs-observed scatter; points near the diagonal indicate good fit.
ggplot(chartperformance, aes(x = predicted, y = observed)) +
  geom_point() +
  labs(title = "Performance of Number of Hunters Prediction",
       caption = "source: cpw.state.co.us")

After the final model type and tuning parameters have been identified, finalize the model by training on the full dataset, so it can be used to predict future data.

# Train the final Cubist model on the complete dataset, using the same
# resampling control and tuning budget as the exploratory fit above.
FinalHuntersmodel <- train(
  Hunters ~ .,
  data = COElkHunters,
  method = "cubist",
  # preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = fitControl
)

# Print the resampling summary for the final model.
FinalHuntersmodel
## Cubist 
## 
## 1545 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 10 times) 
## Summary of sample sizes: 1159, 1158, 1160, 1158, 1158, 1158, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    1          0          586.5089  0.6013548  396.5161
##    1          5          221.0819  0.9422353  127.2486
##    1          9          242.0125  0.9311877  147.9662
##   10          0          513.5595  0.6947715  345.1116
##   10          5          165.0443  0.9682473  105.3905
##   10          9          185.8796  0.9600914  122.7733
##   20          0          508.5148  0.7017838  341.5211
##   20          5          162.4275  0.9691986  103.6775
##   20          9          183.2433  0.9612364  120.7936
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.

Important predictors

# Variable importance for the final model, scaled to 0-100.
# NOTE(review): this overwrites the earlier ImpPred and is never printed or
# used afterward -- confirm it is still needed.
ImpPred <- varImp(FinalHuntersmodel, scale = TRUE)

Use the 2018 Draw data to predict the number of hunters in 2018

# Build the 2018 prediction frame from the 2018 draw results, keeping Unit
# plus whichever columns the final model was trained on.
COElkDraw2018 <- filter(COElkDraw, Year == 2018)
keep_cols <- c("Unit", FinalHuntersmodel$coefnames)
COElkHunters2018 <- COElkDraw2018[, colnames(COElkDraw2018) %in% keep_cols]

# Predict 2018 hunters, then clamp: a negative hunter count is not meaningful.
COElkHunters2018$Hunters <- predict(FinalHuntersmodel, COElkHunters2018)
COElkHunters2018$Hunters[COElkHunters2018$Hunters < 0] <- 0

Save off the predictions so we don’t have to recreate the model every time we want the results

# Persist the 2018 predictions so later documents can load() them directly.
save(COElkHunters2018,file="COElkHunters2018.RData")

Total Elk Hunters

Statewide

# Re-aggregate historical hunters to one row per Year x Unit (all seasons,
# all three license categories combined).
COElkHuntersStatewide <- summarise(group_by(COElkRifleAll, Year, Unit),
                                   Hunters = sum(c(Hunters.Antlered, Hunters.Antlerless, Hunters.Either),
                                                 na.rm = TRUE))

# Align Year types before binding: the historical frame's Year is character
# (see the "2018" string filter below), while the predicted frame's is numeric.
COElkHunters2018b <- COElkHunters2018
COElkHunters2018b$Year <- as.character(COElkHunters2018b$Year)

Join 2018 to historic data

# Append the 2018 predictions to the historical data; rbind.fill (plyr)
# fills columns present in only one of the two frames with NA.
COElkHuntersAll <- rbind.fill(COElkHuntersStatewide,COElkHunters2018b)

# Collapse units to a statewide annual total. na.rm = TRUE for consistency
# with every other aggregation in this script -- a single unit with an NA
# hunter count would otherwise turn an entire year's statewide total into NA.
COElkHuntersStatewide <- summarise(group_by(COElkHuntersAll, Year),
                                   Hunters = sum(Hunters, na.rm = TRUE))

# Statewide hunters per year, with the 2018 prediction as the final bar.
# NOTE(review): coord_cartesian starts the y-axis at 110,000, which visually
# exaggerates year-to-year differences -- confirm this is intentional.
ggplot(COElkHuntersStatewide, aes(Year,Hunters)) +
  geom_bar(stat="identity",fill=ggthemes_data$hc$palettes$default[2]) +
  coord_cartesian(ylim = c(110000,160000)) +
  labs(title="Statewide Elk Hunters", caption="source: cpw.state.co.us")

TODO commentary


Hunters by Unit

I’d like to know where the hunters are distributed across the state.

Next year’s data

# Choropleth of predicted 2018 hunters by game management unit.
# Unitboundaries2, COroads, and data_centroids come from the sourced
# "Colorado GMUnit and Road data.R" script -- presumably already fortified
# for ggplot (long/lat/group columns); verify against that script.
# Layer order matters: polygons first, roads over them, unit labels on top.
Year2018 <- filter(COElkHuntersAll, Year == "2018")
HunterstoPlot <- left_join(Unitboundaries2,Year2018, by=c("Unit"))
ggplot(HunterstoPlot, aes(long, lat, group = group)) + 
  geom_polygon(aes(fill = Hunters),colour = "grey50", size = .2) + #Unit boundaries
  geom_path(data = COroads,aes(x = long, y = lat, group = group), color="#3878C7",size=2) + #Roads
  geom_text(data=data_centroids,aes(x=longitude,y=latitude,label = Unit),size=3) + #Unit labels
  scale_fill_distiller(palette = "Oranges",direction = 1,na.value = 'light grey') +
  xlab("") + ylab("") +
  labs(title="Predicted 2018 Colorado Elk Hunters", caption="source: cpw.state.co.us")

TODO - commentary


Number of Hunters Rank of the Units

It would also be beneficial to rank each unit so I can reference the ranking later. In this case, average the number of hunters over the last few years.

# Rank units by predicted 2018 hunter count.
# NOTE(review): the text above says "average ... the last few years", but this
# filters to 2018 only, so mean() averages a single year -- confirm intent.
HunterRank2018 <- filter(COElkHuntersAll, as.numeric(Year) == 2018)
HunterRank2018 <- summarise(group_by(HunterRank2018, Unit),
                            Hunters = mean(Hunters, na.rm = TRUE))
# Rank 1 = most hunters (negate so rank() sorts descending).
HunterRank2018$HuntersRank <- rank(-HunterRank2018$Hunters)

# Keep only the 50 busiest units.
HunterRank2018 <- filter(HunterRank2018, HuntersRank <= 50)

In order for the chart to retain the order of the rows, the X axis variable (i.e. the categories) has to be converted into a factor.

# Sort descending by hunter count, then freeze Unit's factor levels in that
# order so ggplot keeps the ranking instead of sorting the axis alphabetically.
rank_order <- order(HunterRank2018$Hunters, decreasing = TRUE)
HunterRank2018 <- HunterRank2018[rank_order, ]
HunterRank2018$Unit <- factor(HunterRank2018$Unit, levels = HunterRank2018$Unit)

Lollipop Chart

# Lollipop chart: a point at each unit's hunter count with a stem down to zero.
ggplot(HunterRank2018, aes(x = Unit, y = Hunters)) +
  geom_point(size = 3) +
  geom_segment(aes(x = Unit, xend = Unit, y = 0, yend = Hunters)) +
  labs(title = "Elk Hunters 2018\nTop 50 Units",
       subtitle = "Hunters by Unit",
       caption = "source: cpw.state.co.us")

TODO - commentary


Conclusion

TODO